In [1]:
# Importing necessary libraries
from googleapiclient.discovery import build  # To interact with the YouTube Data API
from youtube_transcript_api import YouTubeTranscriptApi
from collections import Counter
import pandas as pd  # For creating and handling DataFrames
import isodate  # To parse ISO 8601 durations
from collections import Counter  # To count occurrences of keywords
import matplotlib.pyplot as plt  # For visualizing results
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
from transformers import pipeline
In [2]:
# YouTube API Key 
API_KEY = 'AIzaSyCUe0mpORyGYYpF-5_XnnoL3Z25gFp_wc8'
In [3]:
# Initialize the YouTube API client
youtube = build('youtube', 'v3', developerKey=API_KEY)
In [8]:
# Function to search and fetch videos
def fetch_youtube_videos(query, max_results=100):
    video_data = []
    next_page_token = None
    fetched_results = 0

    while fetched_results < max_results:
        results_to_fetch = min(max_results - fetched_results, 50)

        # API Request
        request = youtube.search().list(
            q=query,
            part="snippet",
            type="video",
            maxResults=results_to_fetch,
            pageToken=next_page_token,
        )
        response = request.execute()

        for item in response["items"]:
            video_data.append({
                "video_id": item["id"]["videoId"],
                "title": item["snippet"]["title"],
                "description": item["snippet"]["description"],
                "channel_name": item["snippet"]["channelTitle"],
                "channel_link": f"https://www.youtube.com/channel/{item['snippet']['channelId']}",
                "publish_date": item["snippet"]["publishedAt"],
            })

        fetched_results += len(response["items"])
        next_page_token = response.get("nextPageToken", None)
        if not next_page_token:
            break

    return video_data

# Function to fetch additional video details
def fetch_video_details(video_ids):
    video_details = []
    for i in range(0, len(video_ids), 50):
        chunk = video_ids[i : i + 50]
        request = youtube.videos().list(
            part="contentDetails,statistics", id=",".join(chunk)
        )
        response = request.execute()

        for item in response["items"]:
            duration_seconds = isodate.parse_duration(
                item["contentDetails"]["duration"]
            ).total_seconds()
            views = int(item["statistics"].get("viewCount", 0))
            video_details.append({
                "video_id": item["id"],
                "duration": duration_seconds,
                "views": views,
                "likes": int(item["statistics"].get("likeCount", 0)),
                "comments": int(item["statistics"].get("commentCount", 0)),
                "dislikes": int(item["statistics"].get("dislikeCount", 0)) if "dislikeCount" in item["statistics"] else 0,
                "valid": duration_seconds < 300 and views >= 1000,  # Mark valid videos
            })

    return video_details

# Function to fetch transcripts
def fetch_transcripts(video_ids):
    transcripts = {}
    for video_id in video_ids:
        try:
            transcript = YouTubeTranscriptApi.get_transcript(video_id)
            transcript_text = " ".join([t["text"] for t in transcript])
            transcripts[video_id] = transcript_text
        except Exception:
            transcripts[video_id] = "Transcript not available"
    return transcripts

# Fetch videos based on a search query
query = "Tinnitus UK"
raw_video_data = fetch_youtube_videos(query, max_results=300)  # Fetch more data upfront

# Extract video IDs for further details
video_ids = [video["video_id"] for video in raw_video_data]

# Fetch additional details for the videos
detailed_video_data = fetch_video_details(video_ids)

# Separate valid and backup videos
valid_videos = [video for video in detailed_video_data if video["valid"]]
backup_videos = [video for video in detailed_video_data if not video["valid"]]

# Fetch transcripts for all valid and backup videos
valid_video_ids = [video["video_id"] for video in valid_videos]
backup_video_ids = [video["video_id"] for video in backup_videos]
all_transcripts = fetch_transcripts(valid_video_ids + backup_video_ids)

# Create DataFrames
video_df = pd.DataFrame(raw_video_data)
details_df = pd.DataFrame(valid_videos + backup_videos)  # Combine valid and backup videos
final_df = pd.merge(video_df, details_df, on="video_id")

# Add transcripts to the DataFrame
final_df["transcript"] = final_df["video_id"].map(all_transcripts)

# Filter to ensure at least 100 rows
valid_final_df = final_df[final_df["valid"]].head(100)  # Use only valid rows first
if len(valid_final_df) < 100:
    backup_needed = 100 - len(valid_final_df)
    backup_final_df = final_df[~final_df["valid"]].head(backup_needed)
    final_df = pd.concat([valid_final_df, backup_final_df])
else:
    final_df = valid_final_df

# Display the final DataFrame
from IPython.display import display
display(final_df)
video_id title description channel_name channel_link publish_date duration views likes comments dislikes valid transcript
0 Ofg_TGOY5y8 British Tinnitus Association presents #ThisIsM... For people living with tinnitus, there is no s... Tinnitus UK https://www.youtube.com/channel/UC3ktV24vBKJP3... 2021-02-01T08:21:35Z 35.0 72781 101 0 0 True Transcript not available
1 YyT9ZwWy5Jc One Move for Instant Tinnitus Relief #Shorts Dr. Rowe shows an easy exercise that can give ... SpineCare Decompression and Chiropractic Center https://www.youtube.com/channel/UC2lvPVVm0Mv53... 2023-02-12T14:30:09Z 41.0 1016121 35167 1557 0 True Here's one move for instant tinnitus, or tin-N...
2 5IJwmPGY92c Tinnitus sufferers at risk of isolation say ch... Subscribe to 5 News: http://bit.ly/5NewsSub ▻ ... 5 News https://www.youtube.com/channel/UCsAKRVq2n1vcH... 2019-02-04T21:00:00Z 142.0 1076 0 4 0 True imagine trying to carry out everyday tasks wit...
5 0GLJvPp5Xc8 Where does the cure for tinnitus lie? Leading experts discuss where they think a cur... Tinnitus UK https://www.youtube.com/channel/UC3ktV24vBKJP3... 2019-08-29T13:30:04Z 215.0 11346 128 0 0 True [Music] [Music] so I think there's a lot of wo...
6 BigSLy3Kij0 How to Tackle Tinnitus - Part 1 | This Morning Dr Chris is on hand with all the latest inform... This Morning https://www.youtube.com/channel/UChFsYLqPUyLiZ... 2017-08-21T06:34:17Z 175.0 92407 469 4 0 True we're going to do this tinnitus item and we're...
... ... ... ... ... ... ... ... ... ... ... ... ... ...
258 o4GV-EbnMfI Epley Maneuver to Treat BPPV Dizziness Perform Dix-Hallpike maneuver first to determi... Fauquier ENT https://www.youtube.com/channel/UCdioHiFoafczK... 2021-02-18T04:53:55Z 150.0 3828024 25950 766 0 True Normally only with head movement \ndoes fluid ...
261 bh1X5POvBq8 Tinnitus A brief example of the Tinnitus sounds I exper... Tao Boogie https://www.youtube.com/channel/UCY1MBFvSN9utz... 2016-08-03T15:04:16Z 41.0 2145 26 3 0 True Transcript not available
267 eh0Xq9t-HcA Chris Martin about his Tinnitus For more Coldplay video's, please subscribe! :... ColdplayDaily https://www.youtube.com/channel/UC_nWJmRVJuJLY... 2018-10-16T15:00:05Z 105.0 292690 4248 419 0 True I know you've suffered from tinnitus mmm about...
276 Sye7NynVvfE Surgery for TMJ Pain (headaches, earaches, pop... To download this animation and many more, empl... Nucleus Medical Media https://www.youtube.com/channel/UC85VW73bQLEjs... 2022-02-17T15:00:14Z 191.0 346729 4835 376 0 True Transcript not available
281 hLeZ1rJuDJo Tinnitus relief in 10 seconds Tinnitus #hearing,#acupuncture I wish every ti... Nick Chitty https://www.youtube.com/channel/UC_z7YjP02dMNc... 2019-02-08T14:15:32Z 83.0 58349 466 139 0 True hi guys it's Nick I just wanted to share with ...

100 rows × 13 columns

In [5]:
# Calculate the number of rows and columns
num_rows, num_columns = final_df.shape

# Display the results
print(f"Number of rows: {num_rows}")
print(f"Number of columns: {num_columns}")
Number of rows: 100
Number of columns: 13
In [6]:
# Define a function to classify the type of content
def classify_content(title, description, transcript):
    content = f"{title} {description} {transcript}".lower()  # Combine all text fields
    if "treatment" in content:
        return "Treatment"
    elif "relief" in content:
        return "Relief"
    elif "cure" in content:
        return "Cure"
    elif "cause" in content or "causes" in content:
        return "Causes"
    elif "therapy" in content:
        return "Therapy"
    elif "exercise" in content or "exercises" in content:
        return "Exercises"
    elif "stress" in content:
        return "Stress"
    else:
        return "General Information"

# Apply the classification function to each video
final_df["content_type"] = final_df.apply(
    lambda row: classify_content(row["title"], row["description"], row["transcript"]), axis=1
)

# Count the occurrences of each content type
content_counts = final_df["content_type"].value_counts()

# Display the results
print("Most Common Types of Tinnitus-Related Content in YouTube Videos Aimed at UK Audiences:")
print(content_counts)

# Plot the distribution of content types
plt.figure(figsize=(8, 5))
content_counts.plot(kind="bar", color="steelblue")
plt.title("Common Tinnitus-Related Content Types in UK YouTube Videos")
plt.xlabel("Content Type")
plt.ylabel("Frequency")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Most Common Types of Tinnitus-Related Content in YouTube Videos Aimed at UK Audiences:
content_type
Causes                 37
General Information    26
Treatment              19
Relief                  9
Cure                    6
Stress                  2
Therapy                 1
Name: count, dtype: int64
No description has been provided for this image
In [7]:
# Define a function to classify the scientific grounding of the content
def classify_accuracy(title, description, transcript):
    content = f"{title} {description} {transcript}".lower()  # Combine all text fields
    if "nhs" in content or "doctor" in content or "research" in content or "study" in content or "clinical" in content or "evidence-based" in content:
        return "High Accuracy"
    elif "therapy" in content or "relief" in content or "treatment" in content:
        return "Medium Accuracy"
    elif "cure" in content or "natural" in content or "alternative" in content or "holistic" in content:
        return "Low Accuracy"

# Apply the classification function to each video
final_df["accuracy"] = final_df.apply(
    lambda row: classify_accuracy(row["title"], row["description"], row["transcript"]), axis=1
)

# Plot the accuracy distribution as a violin plot
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(12, 6))
sns.violinplot(x="accuracy", y="views", data=final_df, order=["High Accuracy", "Medium Accuracy", "Low Accuracy"], palette="Set2")
plt.title("Violin Plot: Views Distribution by Accuracy Level")
plt.xlabel("Accuracy Level")
plt.ylabel("Views")
plt.tight_layout()
plt.show()
C:\Users\Dell\AppData\Local\Temp\ipykernel_16132\1579571597.py:21: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(x="accuracy", y="views", data=final_df, order=["High Accuracy", "Medium Accuracy", "Low Accuracy"], palette="Set2")
No description has been provided for this image
In [12]:
# Define a function to classify the type of source
def classify_source(channel_name, description, transcript):
    content = f"{channel_name} {description} {transcript}".lower()  # Combine all text fields
    if "doctor" in content or "clinic" in content or "healthcare" in content or "medical" in content:
        return "Healthcare Professional"
    elif "experience" in content or "my journey" in content or "personal story" in content:
        return "Tinnitus Sufferer"
    elif "alternative" in content or "natural cure" in content or "holistic" in content:
        return "Alternative Therapy Promoter"
    else:
        return "General or Other"

# Apply the classification function to each video
final_df["source_type"] = final_df.apply(
    lambda row: classify_source(row["channel_name"], row["description"], row["transcript"]), axis=1
)

# Count the occurrences of each source type
source_counts = final_df["source_type"].value_counts()

# Display the results in a pie chart
plt.figure(figsize=(8, 8))
source_counts.plot(
    kind="pie", 
    autopct='%1.1f%%', 
    startangle=90, 
    colors=["#FF9999", "#66B3FF", "#99FF99", "#FFCC99"],
    labels=source_counts.index
)
plt.title("Types of Sources Creating Tinnitus-Related YouTube Videos")
plt.ylabel("")  # Remove default y-axis label
plt.show()
No description has been provided for this image
In [13]:
# Load the BERT sentiment analysis pipeline
sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

# Function to analyze tone using BERT
def analyze_tone(transcript):
    if not isinstance(transcript, str) or transcript.strip() == "":
        return "No Transcript", 0.0
    # Perform sentiment analysis using BERT
    result = sentiment_analyzer(transcript[:512])  # Limit to the first 512 tokens
    sentiment = result[0]['label']
    score = result[0]['score']
    # Classify tone based on sentiment and confidence
    if score < 0.7:
        return "Neutral", score
    elif sentiment == "POSITIVE":
        return "Positive", score
    else:
        return "Negative", score

# Function to classify the approach
def classify_approach(title, description, transcript):
    content = f"{title} {description} {transcript}".lower()  # Combine all text fields
    if "doctor" in content or "nhs" in content or "research" in content or "evidence-based" in content:
        return "Scientific/Professional"
    elif "experience" in content or "story" in content or "journey" in content:
        return "Personal/Experiential"
    elif "natural" in content or "alternative" in content or "cure" in content:
        return "Alternative/Promotional"
    else:
        return "Other"

# Apply tone and approach analysis to the dataset
final_df["tone"], final_df["tone_score"] = zip(*final_df["transcript"].apply(analyze_tone))
final_df["approach"] = final_df.apply(
    lambda row: classify_approach(row["title"], row["description"], row["transcript"]), axis=1
)

# Summarize tone distribution
tone_counts = final_df["tone"].value_counts().reset_index()
tone_counts.columns = ["Tone", "Count"]

# Summarize approach distribution
approach_counts = final_df["approach"].value_counts().reset_index()
approach_counts.columns = ["Approach", "Count"]

# Plotly: Tone Distribution
fig_tone = px.bar(
    tone_counts,
    x="Tone",
    y="Count",
    title="Tone Distribution of Videos",
    color="Tone",
    text="Count",
    template="plotly"
)
fig_tone.update_traces(textposition="outside")
fig_tone.show()

# Plotly: Approach Distribution
fig_approach = px.pie(
    approach_counts,
    names="Approach",
    values="Count",
    title="Approach Distribution of Videos",
    hole=0.4,  # Donut chart
    color_discrete_sequence=px.colors.qualitative.Set3
)
fig_approach.show()

# Display the first few rows of analysis
sample_analysis = final_df[["title", "tone", "tone_score", "approach"]].head()

# Create a table using Plotly graph_objects
fig_table = go.Figure(
    data=[
        go.Table(
            header=dict(
                values=["<b>Title</b>", "<b>Tone</b>", "<b>Tone Score</b>", "<b>Approach</b>"],
                fill_color="lightblue",
                align="left",
                font=dict(color="black", size=12)
            ),
            cells=dict(
                values=[
                    sample_analysis["title"],
                    sample_analysis["tone"],
                    sample_analysis["tone_score"],
                    sample_analysis["approach"],
                ],
                fill_color="white",
                align="left",
                font=dict(color="black", size=11)
            )
        )
    ]
)

fig_table.update_layout(title="Sample Analysis of Videos", title_x=0.5)
fig_table.show()
WARNING:tensorflow:From C:\Users\Dell\anaconda3\Lib\site-packages\tf_keras\src\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead.

In [14]:
# Group videos by accuracy and calculate engagement metrics
engagement_summary = final_df.groupby("accuracy")[["views", "likes", "comments"]].mean()

# Display the engagement summary
print("Average Engagement Metrics by Accuracy Level:")
print(engagement_summary)
Average Engagement Metrics by Accuracy Level:
                         views        likes    comments
accuracy                                               
High Accuracy     59972.483871  1257.870968  133.322581
Low Accuracy     246117.571429  2234.714286  419.714286
Medium Accuracy  162775.208333  2641.083333  425.166667
In [15]:
# Count tone and approach distribution
tone_counts = final_df["tone"].value_counts()
approach_counts = final_df["approach"].value_counts()

# Display tone and approach counts
print("\nTone Distribution:")
print(tone_counts)
print("\nApproach Distribution:")
print(approach_counts)
Tone Distribution:
tone
Negative    66
Positive    34
Name: count, dtype: int64

Approach Distribution:
approach
Other                      52
Scientific/Professional    23
Personal/Experiential      19
Alternative/Promotional     6
Name: count, dtype: int64
In [16]:
# Correlate accuracy and engagement
correlation_summary = final_df.groupby("accuracy")[["views", "likes", "comments"]].sum()

print("\nEngagement Summary by Accuracy Level:")
print(correlation_summary)

# Display the correlation visually (heatmap)
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
sns.heatmap(correlation_summary, annot=True, fmt=".0f", cmap="Blues")
plt.title("Engagement Metrics by Accuracy Level")
plt.xlabel("Metrics")
plt.ylabel("Accuracy Level")
plt.tight_layout()
plt.show()
Engagement Summary by Accuracy Level:
                   views  likes  comments
accuracy                                 
High Accuracy    1859147  38994      4133
Low Accuracy     1722823  15643      2938
Medium Accuracy  3906605  63386     10204
No description has been provided for this image
In [17]:
# Function to fetch comments for a video
def fetch_comments(video_id):
    comments = []
    try:
        # Check if comments are enabled by fetching statistics
        video_request = youtube.videos().list(part="statistics", id=video_id)
        video_response = video_request.execute()
        
        comment_count = int(video_response["items"][0]["statistics"].get("commentCount", 0))
        if comment_count == 0:
            return ["Comments Disabled"]
        
        # Fetch comments if they are enabled
        request = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            maxResults=50  # Adjust as needed
        )
        response = request.execute()
        
        # Extract comment text
        for item in response.get("items", []):
            comment = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
            comments.append(comment)
    except Exception as e:
        print(f"Error fetching comments for video {video_id}: {e}")
        return ["Error Fetching Comments"]
    return comments

# Example usage: Fetch comments for the first video
final_df["comments_text"] = final_df["video_id"].apply(fetch_comments)
In [18]:
# Sentiment analysis function for comments
def analyze_comment_sentiment(comment_list):
    if not isinstance(comment_list, list) or len(comment_list) == 0:
        return "No Sentiment"
    if "Comments Disabled" in comment_list or "Error Fetching Comments" in comment_list:
        return "No Sentiment"
    sentiments = [sentiment_analyzer(comment[:512])[0]["label"] for comment in comment_list]
    # Return the most frequent sentiment
    return max(set(sentiments), key=sentiments.count)

# Apply sentiment analysis on the fetched comments
final_df["comment_sentiment"] = final_df["comments_text"].apply(analyze_comment_sentiment)

# Summarize sentiment distribution
sentiment_counts = final_df["comment_sentiment"].value_counts()
print("\nComment Sentiment Distribution:")
print(sentiment_counts)
Comment Sentiment Distribution:
comment_sentiment
No Sentiment    48
NEGATIVE        45
POSITIVE         7
Name: count, dtype: int64
In [ ]: